{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prosodic parsing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "STONES = ['parse_by_line', 'parse_by_window', 'parse_by_phrase', 'n_syll_project']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "eg_path1='/Volumes/Present/DH/corpora/chadwyck_poetry/xml/african-american/beadlesa/Z200265018.xml'\n",
    "eg_path2='/Volumes/Present/DH/corpora/chadwyck_poetry/xml/english/miscell2/Z200439011.xml'\n",
    "eg_path3='/Volumes/Present/DH/corpora/chadwyck_poetry/xml/faber/fa0101/Z200557409.xml'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAXLINES=1000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import prosodic as p"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "module 'tools' has no attribute 'config'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-9-8c18385565bd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mllp\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtools\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_to_txt_or_xml_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mpath_to_txt_or_xml_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mendswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'.xml'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m             \u001b[0mtxt\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtext_plain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_to_txt_or_xml_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/litlab/llp/llp/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdirname\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrealpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__file__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mllp\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/litlab/llp/llp/llp.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     22\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;31m#load_corpus, corpora\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/litlab/llp/llp/corpus/__init__.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdirname\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrealpath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m__file__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/litlab/llp/llp/corpus/corpus.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     21\u001b[0m \u001b[0mZIP_PART_DEFAULTS\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'txt'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'freqs'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'metadata'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'xml'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'data'\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     22\u001b[0m \u001b[0mINSTALL_CMDS\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'metadata'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'txt'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'freqs'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'mfw'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'dtm'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mDEST_LLP_CORPORA\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'CLOUD_DEST'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'/Share/llp_corpora'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[0mMANIFEST_REQUIRED_DATA\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: module 'tools' has no attribute 'config'"
     ]
    }
   ],
   "source": [
    "from llp import tools\n",
    "def read_file(path_to_txt_or_xml_file):\n",
    "    try:\n",
    "        if path_to_txt_or_xml_file.endswith('.xml'):\n",
    "            txt=text_plain(path_to_txt_or_xml_file)\n",
    "        else:\n",
    "            #with open(path_to_txt_or_xml_file) as f:\n",
    "            #    txt=f.read()\n",
    "            txt=tools.read(path_to_txt_or_xml_file)\n",
    "        return txt\n",
    "    except (IOError,UnicodeDecodeError) as e:\n",
    "        print('!!',e)\n",
    "        print('!!',path_to_txt_or_xml_file)\n",
    "        print()\n",
    "        return ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#read_file(eg_path1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def counts(string, sub):\n",
    "    count = start = 0\n",
    "    while True:\n",
    "        start = string.find(sub, start) + 1\n",
    "        if start > 0:\n",
    "            count+=1\n",
    "        else:\n",
    "            return count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parsing functions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Raw string parsing function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data_from_line(line,meter,require_parse_data=True):\n",
    "    \"\"\"\n",
    "    Get data from the prosodic line object, with its meter.\n",
    "    \"\"\"\n",
    "    \n",
    "    # get phonological info\n",
    "    weight_str=line.str_weight()\n",
    "    sonority_str=line.str_sonority()\n",
    "    stress_str=line.str_stress()\n",
    "\n",
    "    # store metrical constraint stats\n",
    "    bp=line.bestParse(meter)\n",
    "    \n",
    "    # require \n",
    "    if require_parse_data:\n",
    "        if not bp:\n",
    "            return {}\n",
    "        \n",
    "    ap=line.allParses(meter)\n",
    "    output_dict={}\n",
    "    output_dict['prosodic_line']=line.txt #.encode('utf-8',errors='ignore')\n",
    "    output_dict['parse']=bp.posString(viols=True) if bp else ''\n",
    "    #output_dict['parse']=output_dict['parse'] #.encode('utf-8',errors='ignore')\n",
    "    meter_str=output_dict['meter']=bp.str_meter() if bp else ''\n",
    "    output_dict['num_parses']=len(ap)\n",
    "    output_dict['num_viols'] = bp.totalCount if bp else ''\n",
    "    output_dict['score_viols'] = bp.score() if bp else ''\n",
    "    output_dict['num_sylls']=bp.num_sylls if bp else ''\n",
    "    output_dict['num_words']=len(line.words())\n",
    "    for c in meter.constraints:\n",
    "        sumviol = sum([parse.constraintCounts[c] if c in parse.constraintCounts else 0 for parse in ap])\n",
    "        output_dict[c.name_weight+'_bestparse']=bp.constraintCounts[c] if bp and c in bp.constraintCounts else 0\n",
    "        output_dict[c.name_weight+'_allparse_sum']=sumviol if sumviol else 0\n",
    "    \n",
    "    ## store phonological constraint stats\n",
    "    output_dict['prosodic_stress']=stress_str\n",
    "    output_dict['prosodic_weight']=weight_str\n",
    "    output_dict['prosodic_sonority']=sonority_str\n",
    "    output_dict['num_monosylls']=len([w for w in line.words() if w.numSyll==1])\n",
    "    output_dict['[*clash_across]']=counts(stress_str,'P#P') + counts(stress_str,'P#S') + counts(stress_str,'S#P') + counts(stress_str,'S#S')\n",
    "    output_dict['[*clash_within]']=counts(stress_str,'PP') + counts(stress_str,'PS') + counts(stress_str,'SP') + counts(stress_str,'SS')\n",
    "    output_dict['[*clash_across_primary]']=counts(stress_str,'P#P')\n",
    "    output_dict['[*clash_within_primary]']=counts(stress_str,'PP')\n",
    "    output_dict['[*lapse_across]']=counts(stress_str,'U#U')\n",
    "    output_dict['[*lapse_within]']=counts(stress_str,'UU')\n",
    "    output_dict['[*WSP]']=0\n",
    "    output_dict['[*PEAKPROM]']=0\n",
    "    output_dict['[*High_Stress]']=0\n",
    "    output_dict['[*Low_Unstress]']=0\n",
    "    output_dict['[*High_Strong]']=0\n",
    "    output_dict['[*Low_Weak]']=0\n",
    "    for s,w,hml,mtr in zip(stress_str,weight_str,sonority_str,meter_str):\n",
    "        if s=='U' and w=='H':\n",
    "            output_dict['[*WSP]']+=1\n",
    "        if (s=='P' or s=='S') and w=='L':\n",
    "            output_dict['[*PEAKPROM]']+=1\n",
    "        \n",
    "        if hml=='H' and s in {'P','S'}:\n",
    "            output_dict['[*High_Stress]']+=1\n",
    "        if hml=='L' and s==\"U\":\n",
    "            output_dict['[*Low_Unstress]']+=1\n",
    "        \n",
    "        if hml=='H' and mtr == 's':\n",
    "            output_dict['[*High_Strong]']+=1\n",
    "        if hml=='L' and mtr == 'w':\n",
    "            output_dict['[*Low_Weak]']+=1\n",
    "    \n",
    "    return output_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_string(text_str, meter='default_english', num_processes=1, maxlines=MAXLINES):\n",
    "    \"\"\"\n",
    "    Parse the string, assuming line as unit\n",
    "    \"\"\"\n",
    "    # prosodic parse\n",
    "    if maxlines: text_str='\\n'.join(text_str.split('\\n')[:maxlines])\n",
    "    text = p.Text(text_str)\n",
    "    meter = text.get_meter(meter)\n",
    "\n",
    "    out_ld=[]\n",
    "    for i,line in enumerate(text.iparse(meter=meter, num_processes=num_processes)):\n",
    "        line_d=get_data_from_line(line,meter)\n",
    "        if not line_d or not 'score_viols' in line_d: continue\n",
    "        line_d['line_id']=i+1\n",
    "        out_ld.append(line_d)\n",
    "    return out_ld"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pandas as pd\n",
    "# pd.DataFrame(parse_string(\"\"\"With what attractive charms this goodly frame \n",
    "# Of nature touches the consenting hearts \n",
    "# Of mortal men; and what the pleasing stores \n",
    "# Which beauteous imitation thence derives \n",
    "# To deck the poet's, or the painter's toil; \n",
    "# My verse unfolds.\"\"\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### By line in text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_by_line(path_to_txt_or_xml_file, meter='default_english', num_processes=1):\n",
    "    # get txt\n",
    "    txt=read_file(path_to_txt_or_xml_file)\n",
    "    \n",
    "    # return parse\n",
    "    return parse_string(txt, meter=meter, num_processes=num_processes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pandas as pd\n",
    "# pd.DataFrame(parse_by_line(eg_path)).sort_values('score_viols').head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pandas as pd\n",
    "# df_window1=pd.DataFrame(parse_by_line(eg_path1))\n",
    "# df_window2=pd.DataFrame(parse_by_line(eg_path2))\n",
    "# print(df_window1.mean()['score_viols'], df_window2.mean()['score_viols'])\n",
    "# df_window2.sort_values('num_viols').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### By window"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "def slice(l,num_slices=None,slice_length=None,runts=True,random=False):\n",
    "    \"\"\"\n",
    "    Returns a new list of n evenly-sized segments of the original list\n",
    "    \"\"\"\n",
    "    if random:\n",
    "        import random\n",
    "        random.shuffle(l)\n",
    "    if not num_slices and not slice_length: return l\n",
    "    if not slice_length: slice_length=int(len(l)/num_slices)\n",
    "    newlist=[l[i:i+slice_length] for i in range(0, len(l), slice_length)]\n",
    "    if runts: return newlist\n",
    "    return [lx for lx in newlist if len(lx)==slice_length]\n",
    "\n",
    "def ngram(l,n=3):\n",
    "    grams=[]\n",
    "    gram=[]\n",
    "    for x in l:\n",
    "        gram.append(x)\n",
    "        if len(gram)<n: continue\n",
    "        g=tuple(gram)\n",
    "        grams.append(g)\n",
    "        gram.reverse()\n",
    "        gram.pop()\n",
    "        gram.reverse()\n",
    "    return grams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#slice(read_file(eg_path).split(), slice_length=5)\n",
    "#len(ngram(read_file(eg_path).split(), n=5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_by_window(path_to_txt_or_xml_file, meter='default_english', window_size=5,overlapping_windows=False,max_slices=100000,num_processes=1,):\n",
    "    # get txt\n",
    "    txt=read_file(path_to_txt_or_xml_file)\n",
    "    words=txt.split()\n",
    "    \n",
    "    if overlapping_windows:\n",
    "        word_slices = ngram(words,n=window_size)\n",
    "    else:\n",
    "        word_slices = slice(words,slice_length=window_size)\n",
    "        word_slices = word_slices[:max_slices]\n",
    "        \n",
    "    txt = '\\n'.join([' '.join(slicex) for slicex in word_slices])\n",
    "    \n",
    "    return parse_string(txt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pandas as pd\n",
    "# df_window1=pd.DataFrame(parse_by_window(eg_path1))\n",
    "# df_window2=pd.DataFrame(parse_by_window(eg_path2))\n",
    "# print(df_window1.mean()['score_viols'], df_window2.mean()['score_viols'])\n",
    "# df_window2.sort_values('num_viols').head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## By phrase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Requires NLTK\n",
    "def parse_by_phrase(path_to_txt_or_xml_file, meter='default_english', minword=5):\n",
    "    # get txt\n",
    "    txt=read_file(path_to_txt_or_xml_file)\n",
    "    \n",
    "    # phrases\n",
    "    import re\n",
    "    phrases=re.split('[?.,;:\\n]', txt)\n",
    "    \n",
    "    # recombine for minword\n",
    "    if minword:\n",
    "        phrases2=[]\n",
    "        phrase=[]\n",
    "        for px in phrases:\n",
    "            phrase+=px.split()\n",
    "            if len(phrase)>=minword:\n",
    "                phrases2+=[' '.join(phrase)]\n",
    "                phrase=[]\n",
    "        phrases=phrases2\n",
    "    \n",
    "    # make txt\n",
    "    txt = '\\n'.join(phrases)\n",
    "\n",
    "    # return parsed\n",
    "    return parse_string(txt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import pandas as pd\n",
    "# df_window1=pd.DataFrame(parse_by_phrase(eg_path1))\n",
    "# df_window2=pd.DataFrame(parse_by_phrase(eg_path2))\n",
    "# print(df_window1.mean()['score_viols'], df_window2.mean()['score_viols'])\n",
    "# df_window2.sort_values('score_viols',ascending=False).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## N-Syllable Project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def loopsum(l,sumval=10,overlapping=True,overlapping_offset=1):\n",
    "    stack=[]\n",
    "    for i,x in enumerate(l):\n",
    "        stack.append((i,x))\n",
    "        stack_sum=sum([y for x,y in stack])\n",
    "        #print(stack_sum,stack)\n",
    "        if stack_sum < sumval: continue\n",
    "        \n",
    "        while sum([y for x,y in stack]) > sumval:\n",
    "            gone=stack.pop(0)\n",
    "            _sum=sum([y for x,y in stack])\n",
    "            #print('-',gone,_sum,stack)\n",
    "            #print(_sum,stack,'--')\n",
    "            \n",
    "\n",
    "        #print('>>',stack_sum,stack)\n",
    "        stack_sum=sum([y for x,y in stack])\n",
    "        if stack_sum == sumval:\n",
    "            _sum=sum([y for x,y in stack])\n",
    "            #print(_sum,stack,'>>')\n",
    "            yield stack\n",
    "            stack=stack[overlapping_offset:] if overlapping else []\n",
    "            \n",
    "    if stack and sum([y for x,y in stack])==sumval:\n",
    "        yield stack\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def tokenize_fast(line):\n",
    "    import re\n",
    "    return re.findall(\"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\\'\\w\\-]+\",line.lower())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "def n_syll_project(path_to_txt_or_xml_file, n_syll=10,meter='default_english',\n",
    "                   within_phrases=True,within_lines=False, maxlines=MAXLINES,\n",
    "                   overlapping=False,overlapping_offset=2,shuffle_lines=True,\n",
    "                   can_end_on_unstressed=False,can_end_on_maybe_stressed=False,\n",
    "                   return_postprocessed=True):\n",
    "    \n",
    "    # get txt\n",
    "    txt=read_file(path_to_txt_or_xml_file)\n",
    "    \n",
    "    # words\n",
    "    all_tokens=tokenize_fast(txt)\n",
    "    all_types=set(all_tokens)\n",
    "    \n",
    "    # dictionary\n",
    "    pDict = p.dict['en']\n",
    "    \n",
    "    # getting the numsyll\n",
    "    word2numsyll=dict((w,pDict.get(w)[0].getNumSyll()) for w in all_types)\n",
    "    \n",
    "    # phrases\n",
    "    import re\n",
    "    if within_lines:\n",
    "        phrase_splitter='[?.,;:\\n]' if within_phrases else '[\\n]'\n",
    "    else:\n",
    "        phrase_splitter='[?.,;:]' if within_phrases else ''\n",
    "    \n",
    "    if phrase_splitter:\n",
    "        phrases=re.split(phrase_splitter, txt)\n",
    "    else:\n",
    "        phrases = [txt]\n",
    "        \n",
    "    if not can_end_on_unstressed: unstressed = set(p.dict['en'].unstressedWords)\n",
    "    if not can_end_on_maybe_stressed: maybestressed = set(p.dict['en'].unstressedWords)\n",
    "        \n",
    "    \n",
    "    # loop over phrases to produce the lines\n",
    "    LINES = []\n",
    "    for phrase in phrases:\n",
    "        phrase_words=tokenize_fast(phrase)\n",
    "        phrase_nsylls=[word2numsyll.get(w,0) for w in phrase_words]\n",
    "        \n",
    "        for stack in loopsum(phrase_nsylls,sumval=n_syll,overlapping=overlapping,overlapping_offset=overlapping_offset):\n",
    "            stack_words=[phrase_words[i] for i,x in stack]\n",
    "            if not can_end_on_unstressed and stack_words[-1] in unstressed: continue\n",
    "            if not can_end_on_maybe_stressed and stack_words[-1] in maybestressed: continue\n",
    "            line = ' '.join(stack_words)\n",
    "            #print(line)\n",
    "            LINES.append(line)\n",
    "    \n",
    "    # make txt\n",
    "    \n",
    "    print('>> # lines:',len(LINES),'in',path_to_txt_or_xml_file)\n",
    "    if shuffle_lines: random.shuffle(LINES)\n",
    "    LINES=LINES[:MAXLINES]\n",
    "    \n",
    "    #print('>> # lines:',len(LINES))\n",
    "    txt = '\\n'.join(LINES)\n",
    "    #print(txt)\n",
    "\n",
    "    # return parsed\n",
    "    data=parse_string(txt,meter=meter)\n",
    "    if return_postprocessed:\n",
    "        data=postprocess_avg((path_to_txt_or_xml_file,data))\n",
    "        #print(data)\n",
    "    return data\n",
    "    \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "#n_syll_project('ewrwrerewre')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import pandas as pd\n",
    "# pd.DataFrame(n_syll_project(eg_path3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "#n_syll_project(eg_path3,return_postprocessed=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "#n_syll_project(eg_path3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Postprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def postprocess_avg(individual_slingshot_result, split_bys=['/xml/','/txt/','/_txt_chadwyck/','/_txt_ecco_tcp/','/_txt_sellars/']):\n",
    "    # imports\n",
    "    import os, pandas as pd\n",
    "    \n",
    "    # Split into two components\n",
    "    path,data=individual_slingshot_result\n",
    "    \n",
    "    # make pandas dataframe\n",
    "    df=pd.DataFrame(data)\n",
    "    #df2=pd.DataFrame(index=df.index)\n",
    "    \n",
    "    # normalize\n",
    "    for col in df.columns:\n",
    "        try:\n",
    "            df[col+'_per_100_sylls'] = df[col] / df['num_sylls'] * 100\n",
    "        except (TypeError,ValueError) as e:\n",
    "            pass\n",
    "    \n",
    "    # summarize\n",
    "    d=dict(df.mean())\n",
    "    \n",
    "    # outof\n",
    "    d['num_lines_parsed']=len(data)\n",
    "    \n",
    "    # add id\n",
    "    #split_fn_by='/txt/'\n",
    "    idx=os.path.splitext(path)[0]\n",
    "    for split_fn_by in split_bys:\n",
    "        idx=idx.split(split_fn_by)[-1]\n",
    "    \n",
    "    d['id'] = idx\n",
    "    \n",
    "    # return dict\n",
    "    return d\n",
    "    "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
